In this document we present the joint analysis of the PASS1A metabolomics datasets.
Load the data from the cloud, including: phenotypic data, metabolomic datasets, and metabolomics dictionary.
# --- Setup: load project helper code ---------------------------------
# NOTE(review): absolute ~/Desktop paths make this document
# machine-specific; consider project-relative paths.
source("~/Desktop/repos/motrpac-bic-norm-qc/tools/supervised_normalization_functions.R")
source("~/Desktop/repos/motrpac-bic-norm-qc/tools/unsupervised_normalization_functions.R")
source("~/Desktop/repos/motrpac-bic-norm-qc/tools/gcp_functions.R")
source("~/Desktop/repos/motrpac-bic-norm-qc/tools/association_analysis_methods.R")
source("~/Desktop/repos/motrpac-bic-norm-qc/tools/data_aux_functions.R")
source("~/Desktop/repos/motrpac/tools/prediction_ml_tools.R")
library(randomForest) # for classification tests
# Load the dmaqc data (phenotypic metadata). Rows are keyed by vial
# label so that metabolomics sample columns can be looked up directly.
merged_dmaqc_data = load_from_bucket("merged_dmaqc_data2019-10-15.RData",
"gs://bic_data_analysis/pass1a/pheno_dmaqc/",F)
merged_dmaqc_data = merged_dmaqc_data[[1]]
rownames(merged_dmaqc_data) = as.character(merged_dmaqc_data$vial_label)
# define the tissue variable
merged_dmaqc_data$tissue = merged_dmaqc_data$sampletypedescription
# define the time to freeze variable:
# (death -> collection) + (collection -> freeze), in minutes
merged_dmaqc_data$time_to_freeze = merged_dmaqc_data$calculated.variables.time_death_to_collect_min +
merged_dmaqc_data$calculated.variables.time_collect_to_freeze_min
# col time vs. control (exploratory, kept for reference)
# df = data.frame(
# bid = merged_dmaqc_data$bid,
# edta_col_time = merged_dmaqc_data$calculated.variables.edta_coll_time,
# time_to_freeze = merged_dmaqc_data$time_to_freeze,
# is_control = merged_dmaqc_data$animal.key.is_control,
# tp = merged_dmaqc_data$animal.key.timepoint,
# tissue = merged_dmaqc_data$specimen.processing.sampletypedescription
# )
# df = unique(df)
# boxplot(edta_col_time/3600 ~ is_control,df)
# boxplot(edta_col_time/3600 - tp ~ is_control,df)
# wilcox.test(edta_col_time/3600 ~ is_control,df)
# blood freeze times: for EDTA plasma the relevant interval is from
# the EDTA spin to the freeze, so recompute it for those samples only
blood_samples =
merged_dmaqc_data$specimen.processing.sampletypedescription ==
"EDTA Plasma"
blood_freeze_time =
as.difftime(merged_dmaqc_data$specimen.processing.t_freeze,units = "mins") -
as.difftime(merged_dmaqc_data$specimen.processing.t_edtaspin,units="mins")
blood_freeze_time = as.numeric(blood_freeze_time)
# Chained assignment: overwrites time_to_freeze for blood samples and
# also leaves a top-level variable `time_to_freeze` holding only the
# blood values (apparently unused later - TODO confirm).
time_to_freeze = merged_dmaqc_data$time_to_freeze[blood_samples] =
blood_freeze_time[blood_samples]
# Load our parsed metabolomics datasets
metabolomics_bucket_obj = load_from_bucket(
file = "metabolomics_parsed_datasets_pass1a_external1.RData",
bucket = "gs://bic_data_analysis/pass1a/metabolomics/")
metabolomics_parsed_datasets = metabolomics_bucket_obj$metabolomics_parsed_datasets
Define the variables to be adjusted for:
# Biospecimen covariates to adjust for. The EDTA collection time is
# deliberately excluded (blood freeze times are handled above); "bid"
# is kept because it is required for matching datasets.
biospec_cols = c("acute.test.distance",
                 "calculated.variables.time_to_freeze",
                 "bid")
# Outcome/grouping variables used by the differential analyses.
differential_analysis_cols = c("animal.registration.sex",
                               "animal.key.timepoint",
                               "animal.key.is_control")
# Pipeline / QC variables.
pipeline_qc_cols = c("sample_order")
Some sites do not use the log transformation on their dataset. In this section we plot the coefficient of variation as a function of the mean intensity. We take a single dataset as an example to show how log-transformed data have a reduced mean-variance dependency and smoother plots.
As an additional analysis we also plot the number of missing values per metabolite as a function of its mean intensity. We show that, while there is a high correlation, some missing values appear at fairly high intensities. This is important for imputation, as some sites use a fixed low value instead of knn imputation.
# Plot cv vs means
library(gplots)
# Single example dataset (raw, non-logged intensities);
# rows = metabolites, columns = samples
d = metabolomics_parsed_datasets[["white_adipose_powder,metab_u_hilicpos,unnamed"]]
dx = d$sample_data
# Coefficient of variation of a numeric vector, ignoring NAs.
CoV <- function(x) sd(x, na.rm = TRUE) / mean(x, na.rm = TRUE)
# Per-metabolite means and CoVs on the raw data (rows = metabolites)
dmeans = apply(dx,1,mean,na.rm=T)
CoVs = apply(dx,1,CoV)
inds = !is.na(CoVs) # keep metabolites with a defined CoV
df = data.frame(Mean_intensity = dmeans[inds],CoV = CoVs[inds])
plot(CoV~Mean_intensity,df,cex=0.5,pch=20,main="Raw data")
lines(lowess(CoV~Mean_intensity,df),lty=2,lwd=2,col="blue")
# Repeat after log2 (pseudocount of 1 avoids log(0))
dx = log(1+d$sample_data,base=2)
dmeans = apply(dx,1,mean,na.rm=T)
CoVs = apply(dx,1,CoV)
inds = !is.na(CoVs)
df = data.frame(Mean_intensity = dmeans[inds],CoV = CoVs[inds])
plot(CoV~Mean_intensity,df,cex=0.5,pch=20,main="Log2 data")
lines(lowess(CoV~Mean_intensity,df),lty=2,lwd=2,col="blue")
# Plot number of NAs vs intensity mean
dx = log(1+d$sample_data,base=2)
dmeans = apply(dx,1,mean,na.rm=T)
num_nas = rowSums(is.na(dx))
# NOTE(review): reuses `inds` from the log2 chunk above; consistent
# here only because dx is recomputed identically.
df = data.frame(Num_NAs = num_nas[inds],Mean_intensity = dmeans[inds])
rho = cor(df$Num_NAs,df$Mean_intensity,method="spearman")
rho = format(rho,digits=2)
plot(Num_NAs~Mean_intensity,df,cex=0.5,pch=20,
main=paste("Spearman:",rho))
# Load the processed (normalized) datasets from the bucket
metabolomics_processed_datasets = load_from_bucket(
file="metabolomics_processed_datasets11182019.RData",
bucket = "gs://bic_data_analysis/pass1a/metabolomics/"
)[[1]]
# Reduce the metadata to the selected columns
for(currname in names(metabolomics_processed_datasets)){
curr_data = metabolomics_processed_datasets[[currname]]$normalized_data[[1]]
# organize the metadata: rows matched to the dataset's sample columns
curr_meta = merged_dmaqc_data[colnames(curr_data),
union(biospec_cols,differential_analysis_cols)]
# remove metadata variables with too many NAs (>= 10% missing)
na_counts = apply(is.na(curr_meta),2,sum)
# NOTE(review): single-bracket filtering would collapse to a vector
# if only one column survives (default drop = TRUE) - confirm this
# cannot happen for these columns.
curr_meta = curr_meta[,na_counts/nrow(curr_meta) < 0.1]
metabolomics_processed_datasets[[currname]]$sample_meta_parsed = curr_meta
}
Untargeted data are typically log-transformed and analyzed using linear models. On the other hand, concentration data are sometimes analyzed with the same type of models but using the original data. This raises a problem if we wish to compare exact statistics from these data. In this section we perform residual analysis for single metabolites. Our goal is to identify if concentration data behaves “normally” when not log-transformed. The analysis below examines the residuals of the data after fitting linear models for each metabolite, adjusting for freeze time and sex. We then compare the results with and without the log-transformation, counting the number of metabolites with a significant evidence for non-normally distributed residuals.
# Test a vector of residuals for normality.
# Returns a p-value: small values are evidence of non-normality.
# Uses the Shapiro-Wilk test when applicable and falls back to a
# Kolmogorov-Smirnov test against a fitted normal when shapiro.test
# fails (e.g., number of non-missing values outside [3, 5000]).
# A constant vector returns 0 (a "clearly not normal" sentinel, which
# also avoids a zero-sd reference distribution in the KS fallback).
is_normal_test<-function(v){
if(sd(v,na.rm = T)==0){return(0)}
# silent = TRUE: do not print the error when the Shapiro test is not
# applicable - just fall through to the KS test below
try({return(shapiro.test(v)$p.value)},silent = TRUE)
return(ks.test(v,"pnorm",mean(v,na.rm=T),sd(v,na.rm = T))$p.value)
}
# go over the named datasets, get a logged and an unlogged version of
# the data, use these as inputs for the regression.
# Targeted datasets only; the stored normalized data are assumed to be
# on the log2 scale, so 2^x recovers the unlogged values - TODO confirm.
residual_analysis_results = list()
for(nn1 in names(metabolomics_processed_datasets)){
if(!metabolomics_processed_datasets[[nn1]]$is_targeted){next}
x_log = metabolomics_processed_datasets[[nn1]]$normalized_data[[1]]
x_unlog = 2^x_log
# take the covariates, ignore distances
x_meta = unique(metabolomics_processed_datasets[[nn1]]$sample_meta_parsed)
curr_covs = x_meta[,intersect(colnames(x_meta),biospec_cols[2])]
curr_covs = data.frame(curr_covs,
sex=x_meta$animal.registration.sex)
# get the lm objects: one model per metabolite per time point
curr_models = list()
for(tp in unique(x_meta$animal.key.timepoint)){
res_log = apply(
x_log,1,
pass1a_simple_differential_abundance,
tps = x_meta$animal.key.timepoint,tp=tp,
is_control = x_meta$animal.key.is_control,
covs = curr_covs,return_model=T
)
res_unlog = apply(
x_unlog,1,
pass1a_simple_differential_abundance,
tps = x_meta$animal.key.timepoint,tp=tp,
is_control = x_meta$animal.key.is_control,
covs = curr_covs,return_model=T
)
# normality p-values of the model residuals, logged vs unlogged
is_norm = cbind(
sapply(res_log,function(x)is_normal_test(residuals(x))),
sapply(res_unlog,function(x)is_normal_test(residuals(x)))
)
colnames(is_norm) = c("log","not log")
curr_models[[as.character(tp)]] = is_norm
# # test a specific model
# ind = 246
# plot(x_unlog[ind,],x_log[ind,])
#
# resids_log = rstandard(res_log[[ind]])
# fit_log = res_log[[ind]]$fitted.values
# resids_unlog = rstandard(res_unlog[[ind]])
# fit_unlog = res_unlog[[ind]]$fitted.values
#
# plot(fit_log,resids_log)
# plot(fit_unlog,resids_unlog)
}
residual_analysis_results[[nn1]] = curr_models
}
# Is there a significant difference between the two options?
# One-sided paired Wilcoxon per dataset/time point: are the logged
# residual-normality p-values greater than the unlogged ones?
log_vs_unlog_summ_mat = sapply(residual_analysis_results,
function(x)sapply(x,
function(y)
wilcox.test(y[,1],y[,2],paired = T,alternative = "g")$p.value))
# Count the number of non-normal metabolites (p < 0.05)
num_nonnormal_log = sapply(residual_analysis_results,
function(x)sapply(x,
function(y)sum(y[,1]<0.05)))
num_nonnormal_log =
num_nonnormal_log[,order(colnames(num_nonnormal_log))]
num_nonnormal_unlog = sapply(residual_analysis_results,
function(x)sapply(x,
function(y)sum(y[,2]<0.05)))
num_nonnormal_unlog =
num_nonnormal_unlog[,order(colnames(num_nonnormal_unlog))]
library(corrplot)
par(mar = c(5,5,5,10))
# Negative entries: fewer non-normal metabolites when using logs
normdiffs = t(num_nonnormal_log)- t(num_nonnormal_unlog)
corrplot(normdiffs,is.corr = F,tl.cex = 0.7)
Compare overlaps, effect sizes, and correlations within tissues. Compare targeted-untargeted pairs only. For differential analysis we use the same model as in the analysis above.
# Transform the data matrix to have compound names as row names.
# This requires removing rows without names and changing the rownames
# of the input matrix x.
# Also, do not assume that the row annotation order fits the data.
#
# x:           numeric matrix, rownames are platform-specific ids
# row_annot_x: data.frame of row annotations; one column holds the ids
#              used in rownames(x) and it must also contain a
#              motrpac_comp_name column
# Returns x restricted to annotated rows, renamed to motrpac_comp_name.
extract_by_comp_name_from_row_annot<-function(x,row_annot_x){
# get the column that has the row names of x or at least
# have the greatest intersection with it
int_sizes = apply(row_annot_x,2,function(x,y)length(intersect(x,y)),y=rownames(x))
ind = which(int_sizes==max(int_sizes,na.rm = T))[1]
# update the annotation table to have only rows that have
# a row in x and then update the rownames to be from the
# selected column
# BUG FIX: is.element() has no `set` argument (formals are el, table),
# so the original call `is.element(..., set=...)` raised an
# "unused argument" error; use %in% instead.
row_annot_x = row_annot_x[row_annot_x[,ind] %in% rownames(x),]
rownames(row_annot_x) = row_annot_x[,ind]
# we can now intersect x and the annotation using their row names
# and update x accordingly
shared = intersect(rownames(row_annot_x),rownames(x))
# drop = FALSE keeps matrix/data.frame shape when a single row is
# shared (otherwise the rownames assignment below would fail)
x = x[shared,,drop = FALSE]
row_annot_x = row_annot_x[shared,,drop = FALSE]
rownames(x) = row_annot_x$motrpac_comp_name
return(x)
}
# Normalization-method pairs to compare:
# element 1 = targeted dataset normalization,
# element 2 = untargeted dataset normalization
tar_vs_untar_norm_pairs = list(
c("log2,imp","log2,imp,TMM"),
c("log2,imp","log2,imp"),
c("log2,imp","med,log,imp"),
c("imp,none","imp,none"),
c("imp,none","imp,TMM"),
c("imp,none","med,log,imp")
)
# The loop below is the core of the computations for comparing
# targeted and untargeted data when using known compound names
#
# For each normalization option (a pair from the list above) we compute
# all pairwise correlation matrices of the overlapping metabolites and
# the differential analysis results (again of the overlap).
# These objects are then used later for other quantitative comparisons
norm_method2comparison_results = list()
for(norm_pairs in tar_vs_untar_norm_pairs){
tar_norm_method = norm_pairs[1]
untar_norm_method = norm_pairs[2]
single_metabolite_corrs = list()
single_metabolite_de = c()
named2covered_shared_metabolites = list()
# outer loop: targeted datasets (nn1);
# inner loop: untargeted datasets (nn2) from the same tissue
for(nn1 in names(metabolomics_processed_datasets)){
nn1_tissue = metabolomics_processed_datasets[[nn1]]$tissue
if(!metabolomics_processed_datasets[[nn1]]$is_targeted){next}
single_metabolite_corrs[[nn1]] = list()
# note: assigning NULL to a list element is a no-op "initialization";
# the entry is created by the union() call further below
named2covered_shared_metabolites[[nn1]] = NULL
for(nn2 in names(metabolomics_processed_datasets)){
if(nn2 == nn1){next}
if(metabolomics_processed_datasets[[nn2]]$is_targeted){next}
nn2_tissue = metabolomics_processed_datasets[[nn2]]$tissue
nn2_dataset = paste(strsplit(nn2,split=",")[[1]][3:4],collapse = ",") # currently unused
if(nn1_tissue!=nn2_tissue){next}
# get the numeric datasets and their annotation
x = metabolomics_processed_datasets[[nn1]]$normalized_data[[tar_norm_method]]
y = metabolomics_processed_datasets[[nn2]]$normalized_data[[untar_norm_method]]
row_annot_x = metabolomics_processed_datasets[[nn1]]$row_annot
row_annot_y = metabolomics_processed_datasets[[nn2]]$row_annot
# transform metabolite names to the motrpac comp name
x = extract_by_comp_name_from_row_annot(x,row_annot_x)
y = extract_by_comp_name_from_row_annot(y,row_annot_y)
# align the sample sets
bid_y = merged_dmaqc_data[colnames(y),"bid"]
bid_x = merged_dmaqc_data[colnames(x),"bid"]
# step 1: merge samples from the same BID
if(length(unique(bid_x))!=length(bid_x)){
x = aggregate_repeated_samples(x,bid_x)
}
else{
colnames(x) = bid_x
}
if(length(unique(bid_y))!=length(bid_y)){
y = aggregate_repeated_samples(y,bid_y)
}else{
colnames(y) = bid_y
}
# step 2: use the shared bio ids
shared_bids = as.character(intersect(colnames(y),colnames(x)))
x = as.matrix(x[,shared_bids])
y = as.matrix(y[,shared_bids])
# At this point x and y are over the same BIDs, now we add the metadata
# NOTE(review): the metadata comes from nn1 (the targeted dataset)
# although the variable is named y_meta - confirm this is intended
y_meta =
unique(metabolomics_processed_datasets[[nn1]]$sample_meta_parsed)
rownames(y_meta) = y_meta$bid
y_meta = y_meta[shared_bids,]
# If one dataset is log-transformed and the other is not
# transform back to the original values
is_tar_log = grepl("log",tar_norm_method)
is_untar_log = grepl("log",untar_norm_method)
if(is_tar_log && !is_untar_log){
x = 2^x
}
if(!is_tar_log && is_untar_log){
y = 2^y
}
# If data are not log-transformed then scale the rows
if(!is_tar_log || !is_untar_log){
x = t(scale(t(x),center = T,scale = T))
y = t(scale(t(y),center = T,scale = T))
}
# get the shared metabolites
shared_metabolites = intersect(rownames(x),rownames(y))
shared_metabolites = na.omit(shared_metabolites)
if(length(shared_metabolites)==0){next}
named2covered_shared_metabolites[[nn1]] = union(
named2covered_shared_metabolites[[nn1]],
shared_metabolites
)
# Compute the correlation matrices of the shared metabolites
# (rows of corrs = targeted, columns = untargeted)
if(length(shared_metabolites)>1){
corrs =cor(t(x[shared_metabolites,]),
t(y[shared_metabolites,]),method = "spearman")
}
else{
corrs = cor(x[shared_metabolites,],
y[shared_metabolites,],method = "spearman")
}
# take the covariates (ignore distances)
curr_cov_cols = intersect(colnames(y_meta),biospec_cols[2])
curr_covs = data.frame(y_meta[,curr_cov_cols])
names(curr_covs) = curr_cov_cols
curr_covs$sex = y_meta$animal.registration.sex # add sex
# differential analysis: one fit per shared metabolite per time point
for(tp in unique(y_meta$animal.key.timepoint)){
curr_control_tp = NULL
# if(tp == 7 || tp == 4){curr_control_tp=7}
resx = t(apply(
matrix(x[shared_metabolites,],nrow=length(shared_metabolites)),1,
pass1a_simple_differential_abundance,
tps = y_meta$animal.key.timepoint,tp=tp,
is_control = y_meta$animal.key.is_control,
covs = curr_covs,return_model=F,
control_tp = curr_control_tp
))
resy = t(apply(
matrix(y[shared_metabolites,],nrow=length(shared_metabolites)),1,
pass1a_simple_differential_abundance,
tps = y_meta$animal.key.timepoint,tp=tp,
is_control = y_meta$animal.key.is_control,
covs = curr_covs,return_model=F,
control_tp = curr_control_tp
))
# Add dataset information, time point, tissue
# These are important annotations for our summary matrix
# called single_metabolite_de below
added_columns = matrix(cbind(
rep(nn1,length(shared_metabolites)),
rep(nn2,length(shared_metabolites)),
shared_metabolites,
rep(tp,length(shared_metabolites)),
rep(nn1_tissue,length(shared_metabolites))
),nrow=length(shared_metabolites))
resx = cbind(resx,rep(T,nrow(resx)))
colnames(resx)[ncol(resx)] = "is_targeted"
resy = cbind(resy,rep(F,nrow(resy)))
colnames(resy)[ncol(resy)] = "is_targeted"
# drop the other dataset's name column so both tables carry the
# same four leading annotation columns (dataset, metabolite, tp,
# tissue)
if(nrow(resx)>1){
resx = cbind(added_columns[,-2],resx)
resy = cbind(added_columns[,-1],resy)
}
else{
resx = c(added_columns[,-2],resx)
resy = c(added_columns[,-1],resy)
}
single_metabolite_de = rbind(single_metabolite_de,resx)
single_metabolite_de = rbind(single_metabolite_de,resy)
}
single_metabolite_corrs[[nn1]][[nn2]] = corrs
}
}
# Reformat the differential analysis results for easier comparison later
single_metabolite_de = data.frame(single_metabolite_de)
names(single_metabolite_de) = c("dataset","metabolite","tp","tissue",
"Est","Std","Tstat","Pvalue","is_targeted")
# columns 5..9 are numeric statistics, columns 1..4 are character
for(col in names(single_metabolite_de)[-c(1:4)]){
single_metabolite_de[[col]] = as.numeric(
as.character(single_metabolite_de[[col]]))
}
for(col in names(single_metabolite_de)[1:4]){
single_metabolite_de[[col]] =
as.character(single_metabolite_de[[col]])
}
# Remove duplications
# Rounding the p-values - necessary for removing duplicates
# using the unique function
rownames(single_metabolite_de) = NULL
for(nn in names(single_metabolite_de)){
ndig = 5
if(grepl("pval",nn,ignore.case = T)){
ndig = 10
}
if(is.numeric(single_metabolite_de[[nn]])){
single_metabolite_de[[nn]] =
round(single_metabolite_de[[nn]],digits = ndig)
}
}
single_metabolite_de = unique(single_metabolite_de)
# Finally, store all the results for the current normalization pair
pair_name = paste(tar_norm_method,untar_norm_method,sep=";")
norm_method2comparison_results[[pair_name]] = list(
single_metabolite_de = single_metabolite_de,
single_metabolite_corrs = single_metabolite_corrs,
named2covered_shared_metabolites = named2covered_shared_metabolites
)
}
We next transform the data above into tables that contain data for each combination of metabolite, time point, and tissue. These are then used for different meta-analyses: (1) a simple random effects analysis, (2) random effects with a binary covariate indicating if a dataset is targeted or untargeted, (3) redo the RE model of (1) with the targeted data only, and (4) redo the RE model of (1) with the untargeted data only.
library(metafor)
# For every (metabolite, tissue, time point) combination, run
# meta-analyses over the per-dataset effect estimates:
#   re_model1      - all datasets
#   re_model2      - all datasets, moderated by is_targeted
#   re_model_tar   - targeted datasets only
#   re_model_untar - untargeted datasets only
# NOTE(review): despite the "re_" names and the "random effects"
# description in the text, method="FE" fits fixed-effects models -
# confirm this is intended.
for(pair_name in names(norm_method2comparison_results)){
meta_analysis_stats = list()
single_metabolite_de =
norm_method2comparison_results[[pair_name]]$single_metabolite_de
for(tissue in unique(single_metabolite_de$tissue)){
for(tp in unique(single_metabolite_de$tp)){
curr_subset = single_metabolite_de[
single_metabolite_de$tissue==tissue &
single_metabolite_de$tp==tp,]
for(metabolite in unique(curr_subset$metabolite)){
curr_met_data = curr_subset[
curr_subset$metabolite==metabolite,]
curr_met_data$var = curr_met_data$Std^2 # sampling variances
re_model1 = NULL;re_model2=NULL
re_model_tar = NULL;re_model_untar = NULL
# try(): a model may fail (e.g. too few rows); keep it NULL then
try({re_model1 = rma.uni(curr_met_data$Est,curr_met_data$var,method="FE")})
try({re_model2 = rma.mv(curr_met_data$Est,curr_met_data$var,
mods=curr_met_data$is_targeted,method="FE")})
try({re_model_tar = rma.uni(
curr_met_data[curr_met_data$is_targeted==1,"Est"],
curr_met_data[curr_met_data$is_targeted==1,"var"],
method="FE"
)})
try({re_model_untar = rma.uni(
curr_met_data[curr_met_data$is_targeted==0,"Est"],
curr_met_data[curr_met_data$is_targeted==0,"var"],
method="FE"
)})
meta_analysis_stats[[paste(metabolite,tissue,tp,sep=",")]] =
list(curr_met_data=curr_met_data,re_model1=re_model1,
re_model2 = re_model2,re_model_tar=re_model_tar,
re_model_untar = re_model_untar)
}
}
}
norm_method2comparison_results[[pair_name]]$meta_analysis_stats =
meta_analysis_stats
}
We first plot the number and percentage of metabolites in the targeted datasets that are measured in at least one untargeted dataset.
library(ggplot2)
# Number of metabolites per dataset
dataset2num_metabolites = sapply(metabolomics_processed_datasets,
function(x)nrow(x$sample_data))
# NOTE(review): named2covered_shared_metabolites is the loop-local
# value left over from the LAST normalization-pair iteration above -
# confirm this is the intended source.
named_dataset_coverage = sapply(named2covered_shared_metabolites,length)
named_dataset_coverage = data.frame(
name = names(named_dataset_coverage),
percentage = named_dataset_coverage /
dataset2num_metabolites[names(named_dataset_coverage)],
count = named_dataset_coverage,
total = dataset2num_metabolites[names(named_dataset_coverage)]
)
# add datasets with no coverage (targeted sets with zero overlap)
all_targeted_datasets = names(metabolomics_processed_datasets)
all_targeted_datasets = all_targeted_datasets[!grepl("untar",all_targeted_datasets)]
zero_coverage_datasets = setdiff(all_targeted_datasets,
named_dataset_coverage$name)
zero_coverage_datasets = data.frame(
name = zero_coverage_datasets,
percentage = rep(0,length(zero_coverage_datasets)),
count = rep(0,length(zero_coverage_datasets)),
total = dataset2num_metabolites[zero_coverage_datasets]
)
named_dataset_coverage = rbind(named_dataset_coverage,
zero_coverage_datasets)
named_dataset_coverage =
named_dataset_coverage[order(as.character(named_dataset_coverage$name)),]
# Barplot: covered fraction per dataset, absolute counts as labels
print(ggplot(named_dataset_coverage, aes(x=name, y=percentage)) +
geom_bar(stat = "identity",width=0.2) + coord_flip() +
geom_text(data=named_dataset_coverage,
aes(name, percentage+0.05, label=count),
position = position_dodge(width=0.9),
size=4) +
ggtitle("Targeted dataset: coverage by untargeted"))
Compare the normalization methods by their correlation distributions.
# Mean/SD of the same-metabolite ("diagonal") cross-platform
# correlations, per tissue and normalization method
rep_correlations = c()
tissues = unique(sapply(metabolomics_processed_datasets,function(x)x$tissue))
for(pair_name in names(norm_method2comparison_results)){
single_metabolite_corrs =
norm_method2comparison_results[[pair_name]]$single_metabolite_corrs
for(tissue in tissues){
curr_datasets = names(single_metabolite_corrs)[
grepl(tissue,names(single_metabolite_corrs))
]
between_corrs = c()
for(tar_dataset in curr_datasets){
l = single_metabolite_corrs[[tar_dataset]]
# diag: correlation of the same metabolite across two platforms
between_corrs = c(between_corrs,unname(unlist(sapply(l,diag))))
}
rep_correlations = rbind(rep_correlations,
c(pair_name,tissue,mean(between_corrs,na.rm=T),sd(between_corrs,na.rm=T)))
}
}
# drop tissue/method combinations without correlations
rep_correlations = rep_correlations[!is.na(rep_correlations[,3]),]
rep_correlations = data.frame(
"NormMethod" = rep_correlations[,1],
"Tissue" = rep_correlations[,2],
"Mean" = as.numeric(rep_correlations[,3]),
"SD" = as.numeric(rep_correlations[,4])
)
rep_correlations = rep_correlations[order(rep_correlations$Tissue),]
print(
ggplot(rep_correlations, aes(x=Tissue, y=Mean, fill=NormMethod)) +
geom_bar(position=position_dodge(), stat="identity", colour='black') +
geom_errorbar(aes(ymin=Mean-SD, ymax=Mean+SD),na.rm=T,
width=.2,position=position_dodge(.9))
)
# For each tissue and normalization pair, count - per metabolite -
# in how many targeted/untargeted datasets it was significant
pthr = 0.001
for(tissue in tissues){
for(pair_name in names(norm_method2comparison_results)){
meta_analysis_stats =
norm_method2comparison_results[[pair_name]]$meta_analysis_stats
meta_analysis_stats = meta_analysis_stats[
grepl(tissue,names(meta_analysis_stats))
]
if(length(meta_analysis_stats)==0){next}
naive_analysis_tar = sapply(meta_analysis_stats,
function(x)sum(x$curr_met_data[,"Pvalue"]<pthr &
x$curr_met_data[,"is_targeted"]))
naive_analysis_untar = sapply(meta_analysis_stats,
function(x)sum(x$curr_met_data[,"Pvalue"]<pthr &
!x$curr_met_data[,"is_targeted"]))
tb = table(naive_analysis_tar,naive_analysis_untar)
# Zero out the top-left cell so the barplot is not dominated by
# metabolites significant nowhere.
# NOTE(review): assumes tb[1,1] is the (0,0) cell, which holds
# only when both margins include a zero count; nonsig_in_both is
# computed but never displayed - confirm intent.
nonsig_in_both = tb[1,1]
tb[1,1] = 0
# NOTE(review): ylab says "log2 number" but raw counts are plotted
barplot(t(tb),
legend=T,xlab="Number of targeted datasets with p<0.001",
ylab = "log2 number of metabolites",
main = paste(tissue,pair_name))
}
}
library(VennDiagram)
require(gridExtra)
pthr = 0.001
tissues = unique(sapply(metabolomics_processed_datasets,function(x)x$tissue))
tar_vs_untar_correlations = c()
# Per tissue and normalization pair: compare the targeted-only and
# untargeted-only meta-analysis results (p-values and effect sizes),
# then plot an effect-size scatter and a Venn diagram of the
# significant metabolite sets.
for(tissue in tissues){
for(pair_name in names(norm_method2comparison_results)){
meta_analysis_stats =
norm_method2comparison_results[[pair_name]]$meta_analysis_stats
meta_analysis_stats = meta_analysis_stats[
grepl(tissue,names(meta_analysis_stats))
]
if(is.null(meta_analysis_stats) || length(meta_analysis_stats)==0){next}
# P-value for the difference between targeted and untargeted
targeted_diff_p =
sapply(meta_analysis_stats,function(x)x$re_model2$pval[2])
# P-values - targeted vs. untargeted
pvals_tar = sapply(meta_analysis_stats,function(x)x$re_model_tar$pval)
pvals_untar = sapply(meta_analysis_stats,function(x)x$re_model_untar$pval)
pvals_untar = unlist(pvals_untar[sapply(pvals_untar,length)>0])
# NOTE(review): the vectors below are aligned positionally; this
# assumes pvals_tar and the filtered pvals_untar have the same
# length and order - confirm upstream guarantees this.
significant_in = rep("None",length(pvals_untar))
significant_in[pvals_tar<pthr] = "Targeted"
significant_in[pvals_untar<pthr] = "Untargeted"
significant_in[pvals_tar<pthr & pvals_untar<pthr] = "Both"
significant_diff = targeted_diff_p<pthr
rho = cor(-log10(pvals_tar),-log10(pvals_untar),method = "pearson")
# Betas - targeted vs. untargeted
betas_tar =
sapply(meta_analysis_stats,function(x)x$re_model_tar$beta[1,1])
betas_untar =
sapply(meta_analysis_stats,function(x)x$re_model_untar$beta[1,1])
betas_untar = unlist(betas_untar[sapply(betas_untar,length)>0])
df = data.frame(
targeted = betas_tar,
untargeted = betas_untar,
significant_in = significant_in,
significant_diff = significant_diff
)
rho_beta = cor(betas_untar,betas_tar,method = "pearson")
rhop = cor.test(betas_untar,betas_tar,method = "pearson")$p.value
print(
ggplot(df, aes(x=targeted, y=untargeted,
shape=significant_diff, color=significant_in)) +
geom_point() + geom_abline(slope=1,intercept = 0) +
ggtitle(paste(tissue, pair_name,
"effects, rho=:",format(rho_beta,digits=2)))
)
tar_vs_untar_correlations = rbind(tar_vs_untar_correlations,
c(tissue,pair_name,rho,rho_beta))
# draw a venn diagram of the significant metabolite sets
inter_area = sum(significant_in=="Both")
tar_area = sum(significant_in=="Targeted") + inter_area
untar_area = sum(significant_in=="Untargeted") + inter_area
subt = paste("Not significant in both:",table(significant_in)["None"])
venng = draw.pairwise.venn(tar_area,untar_area,inter_area,
c("Targeted","Untargeted"),lty = rep("blank",2),
fill = c("pink", "cyan"), alpha = rep(0.5, 2),
cat.dist = rep(0.01, 2),ind=F)
# grid.newpage()
grid.arrange(gTree(children=venng),
top=paste(tissue,pair_name), bottom=subt)
}
}
We examine the average correlation between the platforms (within tissues). Whenever two platforms share more than a single metabolite we plot both the average correlation between the same metabolites and the average correlation between different metabolites. Including the latter is important because it provides a baseline for judging how large a same-metabolite correlation needs to be before it is meaningful; in many of the cases below, even the between-different-metabolite correlation is greater than one might expect.
# Next examine the Spearman correlations between platforms
# Mean of |x|; extra arguments (e.g. na.rm) are forwarded to mean().
mean_abs <- function(x, ...) {
  mean(abs(x), ...)
}
# SD of |x|; extra arguments (e.g. na.rm) are forwarded to sd().
sd_abs <- function(x, ...) {
  sd(abs(x), ...)
}
# Summarize a correlation matrix with `func`: the diagonal holds the
# same-metabolite correlations ("same"), the strictly lower triangle
# holds cross-metabolite correlations ("other"). A 1x1 input yields
# NA for "other".
extract_diag_vs_non_diag <- function(corrs, func = mean, ...) {
  if (length(corrs) == 1) {
    return(c(same = func(corrs, ...), other = NA))
  }
  diag_summary <- func(diag(corrs), ...)
  off_diag_vals <- c(corrs[lower.tri(corrs, diag = F)])
  c(same = diag_summary, other = func(off_diag_vals, ...))
}
# Compare same-metabolite ("same") vs different-metabolite ("other")
# correlations for the log2,imp;log2,imp normalization pair
single_metabolite_corrs =
norm_method2comparison_results$`log2,imp;log2,imp`$single_metabolite_corrs
for(tar_dataset in names(single_metabolite_corrs)){
l = single_metabolite_corrs[[tar_dataset]]
if(length(l)==0){next}
corr_info = as.data.frame(t(sapply(l, extract_diag_vs_non_diag)))
corr_sd = as.data.frame(t(sapply(l, extract_diag_vs_non_diag,func=sd)))
# shorten the row names (keep the platform/mode tokens only)
rownames(corr_info) = sapply(rownames(corr_info),
function(x)paste(strsplit(x,split=",")[[1]][3:4],collapse=","))
rownames(corr_sd) = rownames(corr_info)
corr_info$dataset = rownames(corr_info)
corr_sd$dataset = corr_info$dataset
# NOTE(review): melt() is from reshape2, which is not loaded
# explicitly in this document - confirm it is attached upstream.
corr_info = melt(corr_info)
corr_sd = melt(corr_sd)
corr_info$sd = corr_sd$value
print(
ggplot(corr_info, aes(x=dataset, y=value, fill=variable)) +
geom_bar(position=position_dodge(), stat="identity", colour='black') +
geom_errorbar(aes(ymin=value-sd, ymax=value+sd),na.rm=T,
width=.2,position=position_dodge(.9)) +
ggtitle(tar_dataset) + xlab("Untargeted dataset") + ylab("Spearman") +
labs(fill = "Pair type") +
theme(legend.position="top",legend.direction = "horizontal")
)
}
Here are the results for lactate in plasma.
Example from a log2 dataset:
# Forest plots for lactate in plasma (log2 normalization pair),
# ordered by time point
meta_analysis_stats =
norm_method2comparison_results$`log2,imp;med,log,imp`$meta_analysis_stats
lact_res = meta_analysis_stats[
grepl("lact",names(meta_analysis_stats),ignore.case = T) &
grepl("plasma",names(meta_analysis_stats),ignore.case = T)
]
# names have the form "metabolite,tissue,tp"; token 3 is the time point
lact_res_hours = sapply(names(lact_res),
function(x)as.numeric(strsplit(x,split=",")[[1]][3]))
lact_res = lact_res[order(lact_res_hours)]
for(lact_example in names(lact_res)[1:6]){
curr_labels = gsub("plasma,","",
meta_analysis_stats[[lact_example]][[1]][,1])
forest(meta_analysis_stats[[lact_example]]$re_model1,
slab = curr_labels,
main = lact_example,xlab = "Log fc",
col = "blue",cex = 1.1)
}
Example from a scaled dataset:
# Same plots as above, but for the scaled (imp,none;imp,none) pair
meta_analysis_stats =
norm_method2comparison_results$`imp,none;imp,none`$meta_analysis_stats
lact_res = meta_analysis_stats[
grepl("lact",names(meta_analysis_stats),ignore.case = T) &
grepl("plasma",names(meta_analysis_stats),ignore.case = T)
]
# names have the form "metabolite,tissue,tp"; token 3 is the time point
lact_res_hours = sapply(names(lact_res),
function(x)as.numeric(strsplit(x,split=",")[[1]][3]))
lact_res = lact_res[order(lact_res_hours)]
for(lact_example in names(lact_res)[1:6]){
curr_labels = gsub("plasma,","",
meta_analysis_stats[[lact_example]][[1]][,1])
forest(meta_analysis_stats[[lact_example]]$re_model1,
slab = curr_labels,
main = lact_example,xlab = "Log fc",
col = "blue",cex = 1.1)
}
We can now check the same analysis for liver:
# Same analysis for liver.
# NOTE(review): meta_analysis_stats here is still the
# `imp,none;imp,none` object assigned in the previous chunk.
lact_res = meta_analysis_stats[
grepl("lact",names(meta_analysis_stats),ignore.case = T) &
grepl("liver",names(meta_analysis_stats),ignore.case = T)
]
# names have the form "metabolite,tissue,tp"; token 3 is the time point
lact_res_hours = sapply(names(lact_res),
function(x)as.numeric(strsplit(x,split=",")[[1]][3]))
lact_res = lact_res[order(lact_res_hours)]
for(lact_example in names(lact_res)[1:6]){
curr_labels = gsub("liver_powder,","",
meta_analysis_stats[[lact_example]][[1]][,1])
forest(meta_analysis_stats[[lact_example]]$re_model1,
slab = curr_labels,
main = lact_example,xlab = "Log fc",
col = "blue",cex = 1.1)
}
From the plots above we take the most extreme examples and examine their forest plots.
# Pick extreme example metabolites (agreement/disagreement between
# targeted and untargeted) and draw their forest plots.
meta_analysis_stats =
norm_method2comparison_results$`imp,none;imp,none`$meta_analysis_stats
# P-value for the difference between targeted and untargeted
targeted_diff_p =
sapply(meta_analysis_stats,function(x)x$re_model2$pval[2])
# P-values - targeted vs. untargeted
pvals_tar = sapply(meta_analysis_stats,function(x)x$re_model_tar$pval)
pvals_untar = sapply(meta_analysis_stats,function(x)x$re_model_untar$pval)
pvals_untar = unlist(pvals_untar[sapply(pvals_untar,length)>0])
# NOTE(review): sample() without set.seed makes the chosen examples
# non-reproducible between runs.
agree_example = names(sample(which(pvals_tar< 1e-5 & pvals_untar < 1e-5 &
targeted_diff_p > 0.1))[1])
# Strip the tissue and ",untargeted" tokens from a dataset label
simplify_labels_for_forest<-function(s){
s = gsub(",untargeted","",s)
tissue = strsplit(s,split=",")[[1]][1]
s = gsub(paste(tissue,",",sep=""),"",s)
return(s)
}
forest(meta_analysis_stats[[agree_example]]$re_model1,
slab = simplify_labels_for_forest(
meta_analysis_stats[[agree_example]][[1]][,1]),
main = paste(agree_example,"significant in both, tar and untar agree",sep="\n"),
xlab = "Log fc",col = "blue")
# significant in both but with a significant targeted/untargeted gap
agree_p_disagree_beta = names(sample(which(pvals_tar< 1e-5 & pvals_untar < 1e-5 &
targeted_diff_p < 0.001))[1])
forest(meta_analysis_stats[[agree_p_disagree_beta]]$re_model1,
slab = simplify_labels_for_forest(
meta_analysis_stats[[agree_p_disagree_beta]][[1]][,1]),
main = paste(agree_p_disagree_beta,
"significant in both, tar and untar disagree",sep="\n"),
xlab = "Log fc",col = "blue")
# significant in targeted only
disagree_example1 = names(sample(which(pvals_tar< 1e-10 & pvals_untar >0.1))[1])
forest(meta_analysis_stats[[disagree_example1]]$re_model1,
slab = simplify_labels_for_forest(
meta_analysis_stats[[disagree_example1]][[1]][,1]),
main = paste(disagree_example1,
"significant targeted, tar and untar disagree",sep="\n"),
xlab = "Log fc",col = "blue")
# significant in untargeted only
disagree_example2 = names(sample(which(pvals_tar > 0.1 & pvals_untar < 1e-20))[1])
forest(meta_analysis_stats[[disagree_example2]]$re_model1,
slab = simplify_labels_for_forest(
meta_analysis_stats[[disagree_example2]][[1]][,1]),
main = paste(disagree_example2,
"significant in untargeted, tar and untar disagree",sep="\n"),
xlab = "Log fc",col = "blue")
Use 5-fold cross validation for analysis within tissues. For each pair of targeted and untargeted datasets from the same tissue, we use the untargeted data as the predictive features and all metabolites in the targeted datasets as the dependent variables. The code below uses feature selection and random forests to train the predictive models.
# Cross-validated prediction analysis: for every pair of a targeted
# dataset (nn1, provides the dependent variables) and an untargeted
# dataset (nn2, provides the predictive features) from the same
# tissue, train random forests with feature selection and collect the
# out-of-fold predictions from nfolds-fold cross-validation.
nfolds = 5
prediction_analysis_results = list()
for(nn1 in names(metabolomics_processed_datasets)){
  # tissue is the first comma-separated token of the dataset name
  nn1_tissue = strsplit(nn1,split=",")[[1]][1]
  nn1_tissue = gsub("_powder","",nn1_tissue)
  if(grepl("untargeted",nn1)){next} # nn1 must be targeted
  for(nn2 in names(metabolomics_processed_datasets)){
    if(nn2 == nn1){next}
    if(!grepl("untargeted",nn2)){next} # nn2 must be untargeted
    nn2_tissue = strsplit(nn2,split=",")[[1]][1]
    nn2_tissue = gsub("_powder","",nn2_tissue)
    nn2_dataset = strsplit(nn2,split=",")[[1]][2]
    if(nn1_tissue!=nn2_tissue){next} # same-tissue pairs only
    print(paste("features from:",nn2))
    print(paste("labels from:",nn1))
    # get the numeric datasets and their annotation
    y = metabolomics_processed_datasets[[nn1]]$normalized_data[[1]]
    x = metabolomics_processed_datasets[[nn2]]$normalized_data[[1]]
    # align the sample sets by the biological id (bid)
    bid_y = merged_dmaqc_data[colnames(y),"bid"]
    bid_x = merged_dmaqc_data[colnames(x),"bid"]
    # step 1: merge samples from the same BID
    if(length(unique(bid_x))!=length(bid_x)){
      x = aggregate_repeated_samples(x,bid_x)
    }
    else{
      colnames(x) = bid_x
    }
    if(length(unique(bid_y))!=length(bid_y)){
      y = aggregate_repeated_samples(y,bid_y)
    }else{
      colnames(y) = bid_y
    }
    # step 2: use the shared bio ids; transpose so rows are samples
    shared_bids = as.character(intersect(colnames(y),colnames(x)))
    x = t(as.matrix(x[,shared_bids]))
    y = t(as.matrix(y[,shared_bids]))
    # At this point x and y are over the same BIDs, now we add the metadata
    y_meta = unique(metabolomics_processed_datasets[[nn1]]$sample_meta_parsed)
    rownames(y_meta) = y_meta$bid
    y_meta = y_meta[shared_bids,]
    # take the covariates (ignore distances)
    curr_cov_cols = intersect(colnames(y_meta),biospec_cols[2])
    curr_covs = data.frame(y_meta[,curr_cov_cols])
    names(curr_covs) = curr_cov_cols
    curr_covs$sex = y_meta$animal.registration.sex # add sex
    # add the covariates into x
    x = cbind(x,curr_covs)
    # Run the regressions: assign each sample to one of nfolds folds
    folds = sample(rep(1:nfolds,(1+nrow(x)/nfolds)))[1:nrow(x)]
    numFeatures = min(ncol(x),2000)
    preds = c();real=c()
    for(i in 1:ncol(y)){
      if( i %% 10 == 0){print(paste("analyzing metabolite number:",i))}
      # BUG FIX: was y[,1], which trained every model on the first
      # metabolite instead of the i-th one
      y_i = y[,i]
      i_preds = c();i_real=c()
      for(j in 1:nfolds){
        tr_x = x[folds!=j,]
        tr_yi = y_i[folds!=j]
        te_x = x[folds==j,]
        te_y = y_i[folds==j]
        # random forest
        # model = randomForest(tr_yi,x=tr_x,ntree = 20)
        # te_preds = predict(model,newdata = te_x)
        # feature selection + random forest on the training fold,
        # then predict the held-out fold
        model = feature_selection_wrapper(tr_x,tr_yi,
          coeff_of_var,randomForest,
          topK = numFeatures,ntree=50)
        te_preds = predict(model,newdata = te_x)
        i_preds = c(i_preds,te_preds)
        i_real = c(i_real,te_y)
      }
      preds = cbind(preds,i_preds)
      real = cbind(real,i_real)
    }
    colnames(preds) = colnames(y)
    # BUG FIX: was colnames(reals) - the object "reals" does not
    # exist (the matrix is called real), which raised an error
    colnames(real) = colnames(y)
    currname = paste(nn1,nn2,sep=";")
    prediction_analysis_results[[currname]] = list(
      preds = preds,real=real
    )
  }
}
save_to_bucket(prediction_analysis_results,
file="tar_vs_untar_prediction_analysis_results.RData",
bucket = "gs://bic_data_analysis/pass1a/metabolomics/")
We now take the predicted and real values and estimate the prediction accuracy in different ways.
# Load the cross-validated predictions and compute per-metabolite accuracy
# metrics (Spearman rho, MSE, RMSE, variance-normalized MSE, and the
# coefficient of variation) for each (tissue, targeted, untargeted) combination.
prediction_analysis_results = load_from_bucket(
  "tar_vs_untar_prediction_analysis_results.RData",
  "gs://bic_data_analysis/pass1a/metabolomics/",F)
prediction_analysis_results = prediction_analysis_results[[1]]
results_metrics = list()
for(pair_name in names(prediction_analysis_results)){
  curr_preds = prediction_analysis_results[[pair_name]]$preds
  curr_real = prediction_analysis_results[[pair_name]]$real
  pair_parts = strsplit(pair_name,split=";")[[1]]
  tar_name = pair_parts[1]
  untar_name = pair_parts[2]
  # the targeted dataset supplies the metabolite names
  y = metabolomics_processed_datasets[[tar_name]]$normalized_data[[1]]
  colnames(curr_preds) = rownames(y)
  colnames(curr_real) = rownames(y)
  # simplify the dataset names and strip the tissue prefix
  tar_name = simplify_metab_dataset_name(tar_name)
  untar_name = simplify_metab_dataset_name(untar_name)
  currtissue = strsplit(tar_name,split=",")[[1]][1]
  tissue_prefix = paste(currtissue,",",sep="")
  tar_name = gsub(tissue_prefix,"",tar_name)
  untar_name = gsub(tissue_prefix,"",untar_name)
  # make sure the nested result containers exist
  if(!currtissue %in% names(results_metrics)){
    results_metrics[[currtissue]] = list()
  }
  if(!tar_name %in% names(results_metrics[[currtissue]])){
    results_metrics[[currtissue]][[tar_name]] = list()
  }
  # per-metabolite accuracy metrics
  rhos = as.numeric(
    format(diag(cor(curr_preds,curr_real,method="spearman")),digits=3))
  SEs = colSums((curr_preds-curr_real)^2)
  MSEs = SEs / nrow(curr_preds)
  RMSE = sqrt(MSEs)
  rMSE = MSEs / apply(y,1,var)          # MSE relative to metabolite variance
  CoVs = apply(y,1,sd) / apply(y,1,mean) # coefficient of variation per metabolite
  discCoVs = cut(CoVs,breaks = 2,ordered_result = TRUE)
  results_metrics[[currtissue]][[tar_name]][[untar_name]] = data.frame(
    rhos = rhos, MSEs = MSEs, RMSE = RMSE, rMSE = rMSE,
    CoVs = CoVs, discCoVs = discCoVs
  )
}
We now present a few summary plots.
# Summary plots: for each tissue and targeted dataset, show boxplots of the
# Spearman correlations stratified by the discretized coefficient of
# variation and by the untargeted feature dataset.
for(tissue in names(results_metrics)){
  for(tar in names(results_metrics[[tissue]])){
    tar_results = results_metrics[[tissue]][[tar]]
    # build one row per metabolite: source dataset, rho, discretized CoV code
    per_untar = lapply(names(tar_results),function(untar){
      curr = tar_results[[untar]]
      data.frame(dataset = rep(untar,nrow(curr)),
                 rhos = curr$rhos,
                 discCoVs = as.numeric(curr$discCoVs))
    })
    rho_vs_cv = do.call(rbind,per_untar)
    boxplot(rhos~discCoVs:dataset,data=rho_vs_cv,las=2,
            ylab="Spearman",xlab = "",ylim=c(0,1),
            main = paste(tissue,tar,sep=","))
  }
}
As additional reference points, we train further models below. First, we check the prediction performance of naive models that use technical and clinical covariates only. Second, we use multi-task regression and deep-learning models.
# Reference models: predict each targeted metabolite from technical and
# clinical covariates only (sex, weight, distance, timepoint), with the same
# 5-fold CV scheme as the main analysis, to quantify how much of the
# prediction accuracy is attributable to the covariates alone.
cov_prediction_analysis_results = list()
for(nn1 in names(metabolomics_processed_datasets)){
  nn1_tissue = strsplit(nn1,split=",")[[1]][1]
  nn1_tissue = gsub("_powder","",nn1_tissue)
  if(grepl("untargeted",nn1)){next} # targeted datasets only
  print(nn1)
  y = metabolomics_processed_datasets[[nn1]]$normalized_data[[1]]
  y_vials = colnames(y)
  bid_y = merged_dmaqc_data[colnames(y),"bid"]
  colnames(y) = bid_y
  y = t(as.matrix(y)) # samples in rows, metabolites in columns
  if(ncol(y)>1000){next} # skip very large (untargeted-sized) datasets
  cov_cols = c("animal.registration.sex",
               "acute.test.weight",
               "acute.test.distance",
               "animal.key.timepoint")
  covs = merged_dmaqc_data[y_vials,cov_cols]
  x = covs
  # Run the regressions: random fold assignment over the samples
  folds = sample(rep(1:nfolds,(1+nrow(x)/nfolds)))[1:nrow(x)]
  numFeatures = min(ncol(x),2000)
  preds = c();real=c()
  for(i in 1:ncol(y)){
    # BUG FIX: take metabolite i (this was hard-coded to column 1, so
    # every model was trained and evaluated on the same metabolite)
    y_i = y[,i]
    i_preds = c();i_real=c()
    for(j in 1:nfolds){
      print(j)
      tr_x = x[folds!=j,]
      tr_yi = y_i[folds!=j]
      te_x = x[folds==j,]
      te_y = y_i[folds==j]
      # random forest on the covariates, predict on the held-out fold
      model = randomForest(tr_yi,x=tr_x,ntree = 20)
      te_preds = predict(model,newdata = te_x)
      i_preds = c(i_preds,te_preds)
      i_real = c(i_real,te_y)
    }
    preds = cbind(preds,i_preds)
    real = cbind(real,i_real)
  }
  cov_prediction_analysis_results[[nn1]] = list(
    preds = preds,real=real
  )
}
# preds = c();real=c()
# for(j in 1:nfolds){
# tr_x = x[folds!=j,]
# tr_y = y[folds!=j,]
# te_x = x[folds==j,]
# te_y = y[folds==j,]
# model = MTL_wrapper(tr_x,tr_y,type="Regression", Regularization="L21")
# te_preds = predict(model,te_x)
# real = rbind(real,te_y)
# preds = rbind(preds,te_preds)
# }
# diag(cor(preds,real))
# Using PLS regression
# library(pls)
# pls_model = plsr(y~x,ncomp = 5,validation="LOO")
# eval = MSEP(pls_model)
#
# y_pca = prcomp(y)
# plot(y_pca)
# explained_var = y_pca$sdev^2/sum(y_pca$sdev^2)
# y_pca_matrix = y_pca$x[,1:10]
#
# # regress out sex, weight
#
# get_explained_variance_using_PCA(x,y)
# x = apply(x,2,regress_out,covs=covs)
# y = apply(y,2,regress_out,covs=covs)
# get_explained_variance_using_PCA(x,y)